House Prices - Advanced Regression Techniques

Logo from Kaggle.

In [1]:
# Basic Libraries to import

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import math
from scipy.stats import norm, skew
import os
from quickda.explore_data import *

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
In [2]:
pd.pandas.set_option('display.max_columns',None)
pd.pandas.set_option('display.max_rows',90)
pd.options.display.float_format = '{:.5f}'.format

# Setting seaborn style
sns.set_palette('Reds')
sns.set_style('darkgrid')
In [3]:
!pip -q install kaggle

from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
In [4]:
api.competitions_list(category='gettingStarted')
Out[4]:
[contradictory-my-dear-watson,
 gan-getting-started,
 tpu-getting-started,
 digit-recognizer,
 titanic,
 house-prices-advanced-regression-techniques,
 connectx,
 nlp-getting-started,
 facial-keypoints-detection,
 street-view-getting-started-with-julia,
 word2vec-nlp-tutorial,
 data-science-london-scikit-learn,
 just-the-basics-the-after-party,
 just-the-basics-strata-2013]
In [5]:
api.competition_download_files('house-prices-advanced-regression-techniques')
In [6]:
from zipfile import ZipFile
zf = ZipFile('house-prices-advanced-regression-techniques.zip')
zf.extractall('data\\')
zf.close()
In [7]:
housePrices = pd.read_csv('./data/train.csv')
train_index = housePrices['Id']
target = housePrices['SalePrice']
target_col = 'SalePrice'
housePrices.head()
Out[7]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.00000 8450 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.00000 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NaN Attchd 2003.00000 RFn 2 548 TA TA Y 0 61 0 0 0 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.00000 9600 Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.00000 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.00000 RFn 2 460 TA TA Y 298 0 0 0 0 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.00000 11250 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.00000 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.00000 RFn 2 608 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.00000 9550 Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.00000 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.00000 Unf 3 642 TA TA Y 0 35 272 0 0 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.00000 14260 Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.00000 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.00000 RFn 3 836 TA TA Y 192 84 0 0 0 0 NaN NaN NaN 0 12 2008 WD Normal 250000
In [8]:
housePrices.shape
Out[8]:
(1460, 81)
In [9]:
housePrices.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
In [10]:
test_df = pd.read_csv('./data/test.csv')
test_index = test_df['Id']
test_df.head()
Out[10]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.00000 11622 Pave NaN Reg Lvl AllPub Inside Gtl NAmes Feedr Norm 1Fam 1Story 5 6 1961 1961 Gable CompShg VinylSd VinylSd None 0.00000 TA TA CBlock TA TA No Rec 468.00000 LwQ 144.00000 270.00000 882.00000 GasA TA Y SBrkr 896 0 0 896 0.00000 0.00000 1 0 2 1 TA 5 Typ 0 NaN Attchd 1961.00000 Unf 1.00000 730.00000 TA TA Y 140 0 0 0 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.00000 14267 Pave NaN IR1 Lvl AllPub Corner Gtl NAmes Norm Norm 1Fam 1Story 6 6 1958 1958 Hip CompShg Wd Sdng Wd Sdng BrkFace 108.00000 TA TA CBlock TA TA No ALQ 923.00000 Unf 0.00000 406.00000 1329.00000 GasA TA Y SBrkr 1329 0 0 1329 0.00000 0.00000 1 1 3 1 Gd 6 Typ 0 NaN Attchd 1958.00000 Unf 1.00000 312.00000 TA TA Y 393 36 0 0 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.00000 13830 Pave NaN IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 5 5 1997 1998 Gable CompShg VinylSd VinylSd None 0.00000 TA TA PConc Gd TA No GLQ 791.00000 Unf 0.00000 137.00000 928.00000 GasA Gd Y SBrkr 928 701 0 1629 0.00000 0.00000 2 1 3 1 TA 6 Typ 1 TA Attchd 1997.00000 Fin 2.00000 482.00000 TA TA Y 212 34 0 0 0 0 NaN MnPrv NaN 0 3 2010 WD Normal
3 1464 60 RL 78.00000 9978 Pave NaN IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 6 6 1998 1998 Gable CompShg VinylSd VinylSd BrkFace 20.00000 TA TA PConc TA TA No GLQ 602.00000 Unf 0.00000 324.00000 926.00000 GasA Ex Y SBrkr 926 678 0 1604 0.00000 0.00000 2 1 3 1 Gd 7 Typ 1 Gd Attchd 1998.00000 Fin 2.00000 470.00000 TA TA Y 360 36 0 0 0 0 NaN NaN NaN 0 6 2010 WD Normal
4 1465 120 RL 43.00000 5005 Pave NaN IR1 HLS AllPub Inside Gtl StoneBr Norm Norm TwnhsE 1Story 8 5 1992 1992 Gable CompShg HdBoard HdBoard None 0.00000 Gd TA PConc Gd TA No ALQ 263.00000 Unf 0.00000 1017.00000 1280.00000 GasA Ex Y SBrkr 1280 0 0 1280 0.00000 0.00000 2 0 2 1 Gd 5 Typ 0 NaN Attchd 1992.00000 RFn 2.00000 506.00000 TA TA Y 0 82 0 0 144 0 NaN NaN NaN 0 1 2010 WD Normal
In [11]:
housePrices.shape, test_df.shape
Out[11]:
((1460, 81), (1459, 80))
In [12]:
from IPython.display import display
with pd.option_context('display.max_rows', 100, 'display.max_columns', 100):
    display(explore(housePrices, method="summarize"))
dtypes count null_sum null_pct nunique min 25% 50% 75% max mean median std skew
1stFlrSF int64 1460 0 0.00000 753 334 882.00000 1087.00000 1391.25000 4692 1162.62671 1087.00000 386.58774 1.37676
2ndFlrSF int64 1460 0 0.00000 417 0 0.00000 0.00000 728.00000 2065 346.99247 0.00000 436.52844 0.81303
3SsnPorch int64 1460 0 0.00000 20 0 0.00000 0.00000 0.00000 508 3.40959 0.00000 29.31733 10.30434
Alley object 91 1369 0.93800 2 - - - - - - - - -
BedroomAbvGr int64 1460 0 0.00000 8 0 2.00000 3.00000 3.00000 8 2.86644 3.00000 0.81578 0.21179
BldgType object 1460 0 0.00000 5 1Fam - - - TwnhsE - - - -
BsmtCond object 1423 37 0.02500 4 - - - - - - - - -
BsmtExposure object 1422 38 0.02600 4 - - - - - - - - -
BsmtFinSF1 int64 1460 0 0.00000 637 0 0.00000 383.50000 712.25000 5644 443.63973 383.50000 456.09809 1.68550
BsmtFinSF2 int64 1460 0 0.00000 144 0 0.00000 0.00000 0.00000 1474 46.54932 0.00000 161.31927 4.25526
BsmtFinType1 object 1423 37 0.02500 6 - - - - - - - - -
BsmtFinType2 object 1422 38 0.02600 6 - - - - - - - - -
BsmtFullBath int64 1460 0 0.00000 4 0 0.00000 0.00000 1.00000 3 0.42534 0.00000 0.51891 0.59607
BsmtHalfBath int64 1460 0 0.00000 3 0 0.00000 0.00000 0.00000 2 0.05753 0.00000 0.23875 4.10340
BsmtQual object 1423 37 0.02500 4 - - - - - - - - -
BsmtUnfSF int64 1460 0 0.00000 780 0 223.00000 477.50000 808.00000 2336 567.24041 477.50000 441.86696 0.92027
CentralAir object 1460 0 0.00000 2 N - - - Y - - - -
Condition1 object 1460 0 0.00000 9 Artery - - - RRNn - - - -
Condition2 object 1460 0 0.00000 8 Artery - - - RRNn - - - -
Electrical object 1459 1 0.00100 5 - - - - - - - - -
EnclosedPorch int64 1460 0 0.00000 120 0 0.00000 0.00000 0.00000 552 21.95411 0.00000 61.11915 3.08987
ExterCond object 1460 0 0.00000 5 Ex - - - TA - - - -
ExterQual object 1460 0 0.00000 4 Ex - - - TA - - - -
Exterior1st object 1460 0 0.00000 15 AsbShng - - - WdShing - - - -
Exterior2nd object 1460 0 0.00000 16 AsbShng - - - Wd Shng - - - -
Fence object 281 1179 0.80800 4 - - - - - - - - -
FireplaceQu object 770 690 0.47300 5 - - - - - - - - -
Fireplaces int64 1460 0 0.00000 4 0 0.00000 1.00000 1.00000 3 0.61301 1.00000 0.64467 0.64957
Foundation object 1460 0 0.00000 6 BrkTil - - - Wood - - - -
FullBath int64 1460 0 0.00000 4 0 1.00000 2.00000 2.00000 3 1.56507 2.00000 0.55092 0.03656
Functional object 1460 0 0.00000 7 Maj1 - - - Typ - - - -
GarageArea int64 1460 0 0.00000 441 0 334.50000 480.00000 576.00000 1418 472.98014 480.00000 213.80484 0.17998
GarageCars int64 1460 0 0.00000 5 0 1.00000 2.00000 2.00000 4 1.76712 2.00000 0.74732 -0.34255
GarageCond object 1379 81 0.05500 5 - - - - - - - - -
GarageFinish object 1379 81 0.05500 3 - - - - - - - - -
GarageQual object 1379 81 0.05500 5 - - - - - - - - -
GarageType object 1379 81 0.05500 6 - - - - - - - - -
GarageYrBlt float64 1379 81 0.05500 97 1900.00000 1961.00000 1980.00000 2002.00000 2010.00000 1978.50616 1980.00000 24.68972 -0.64941
GrLivArea int64 1460 0 0.00000 861 334 1129.50000 1464.00000 1776.75000 5642 1515.46370 1464.00000 525.48038 1.36656
HalfBath int64 1460 0 0.00000 3 0 0.00000 0.00000 1.00000 2 0.38288 0.00000 0.50289 0.67590
Heating object 1460 0 0.00000 6 Floor - - - Wall - - - -
HeatingQC object 1460 0 0.00000 5 Ex - - - TA - - - -
HouseStyle object 1460 0 0.00000 8 1.5Fin - - - SLvl - - - -
Id int64 1460 0 0.00000 1460 1 365.75000 730.50000 1095.25000 1460 730.50000 730.50000 421.61001 0.00000
KitchenAbvGr int64 1460 0 0.00000 4 0 1.00000 1.00000 1.00000 3 1.04658 1.00000 0.22034 4.48840
KitchenQual object 1460 0 0.00000 4 Ex - - - TA - - - -
LandContour object 1460 0 0.00000 4 Bnk - - - Lvl - - - -
LandSlope object 1460 0 0.00000 3 Gtl - - - Sev - - - -
LotArea int64 1460 0 0.00000 1073 1300 7553.50000 9478.50000 11601.50000 215245 10516.82808 9478.50000 9981.26493 12.20769
LotConfig object 1460 0 0.00000 5 Corner - - - Inside - - - -
LotFrontage float64 1201 259 0.17700 110 21.00000 59.00000 69.00000 80.00000 313.00000 70.04996 69.00000 24.28475 2.16357
LotShape object 1460 0 0.00000 4 IR1 - - - Reg - - - -
LowQualFinSF int64 1460 0 0.00000 24 0 0.00000 0.00000 0.00000 572 5.84452 0.00000 48.62308 9.01134
MSSubClass int64 1460 0 0.00000 15 20 20.00000 50.00000 70.00000 190 56.89726 50.00000 42.30057 1.40766
MSZoning object 1460 0 0.00000 5 C (all) - - - RM - - - -
MasVnrArea float64 1452 8 0.00500 327 0.00000 0.00000 0.00000 166.00000 1600.00000 103.68526 0.00000 181.06621 2.66908
MasVnrType object 1452 8 0.00500 4 - - - - - - - - -
MiscFeature object 54 1406 0.96300 4 - - - - - - - - -
MiscVal int64 1460 0 0.00000 21 0 0.00000 0.00000 0.00000 15500 43.48904 0.00000 496.12302 24.47679
MoSold int64 1460 0 0.00000 12 1 5.00000 6.00000 8.00000 12 6.32192 6.00000 2.70363 0.21205
Neighborhood object 1460 0 0.00000 25 Blmngtn - - - Veenker - - - -
OpenPorchSF int64 1460 0 0.00000 202 0 0.00000 25.00000 68.00000 547 46.66027 25.00000 66.25603 2.36434
OverallCond int64 1460 0 0.00000 9 1 5.00000 5.00000 6.00000 9 5.57534 5.00000 1.11280 0.69307
OverallQual int64 1460 0 0.00000 10 1 5.00000 6.00000 7.00000 10 6.09932 6.00000 1.38300 0.21694
PavedDrive object 1460 0 0.00000 3 N - - - Y - - - -
PoolArea int64 1460 0 0.00000 8 0 0.00000 0.00000 0.00000 738 2.75890 0.00000 40.17731 14.82837
PoolQC object 7 1453 0.99500 3 - - - - - - - - -
RoofMatl object 1460 0 0.00000 8 ClyTile - - - WdShngl - - - -
RoofStyle object 1460 0 0.00000 6 Flat - - - Shed - - - -
SaleCondition object 1460 0 0.00000 6 Abnorml - - - Partial - - - -
SalePrice int64 1460 0 0.00000 663 34900 129975.00000 163000.00000 214000.00000 755000 180921.19589 163000.00000 79442.50288 1.88288
SaleType object 1460 0 0.00000 9 COD - - - WD - - - -
ScreenPorch int64 1460 0 0.00000 76 0 0.00000 0.00000 0.00000 480 15.06096 0.00000 55.75742 4.12221
Street object 1460 0 0.00000 2 Grvl - - - Pave - - - -
TotRmsAbvGrd int64 1460 0 0.00000 12 2 5.00000 6.00000 7.00000 14 6.51781 6.00000 1.62539 0.67634
TotalBsmtSF int64 1460 0 0.00000 721 0 795.75000 991.50000 1298.25000 6110 1057.42945 991.50000 438.70532 1.52425
Utilities object 1460 0 0.00000 2 AllPub - - - NoSeWa - - - -
WoodDeckSF int64 1460 0 0.00000 274 0 0.00000 0.00000 168.00000 857 94.24452 0.00000 125.33879 1.54138
YearBuilt int64 1460 0 0.00000 112 1872 1954.00000 1973.00000 2000.00000 2010 1971.26781 1973.00000 30.20290 -0.61346
YearRemodAdd int64 1460 0 0.00000 61 1950 1967.00000 1994.00000 2004.00000 2010 1984.86575 1994.00000 20.64541 -0.50356
YrSold int64 1460 0 0.00000 5 2006 2007.00000 2008.00000 2009.00000 2010 2007.81575 2008.00000 1.32810 0.09627

Missing value analysis

In [13]:
df = pd.concat((housePrices, test_df))
indexs = df['Id']

df.drop(['SalePrice', 'Id'],axis = 1, inplace = True)
df.shape, housePrices.shape
Out[13]:
((2919, 79), (1460, 81))
In [14]:
df.head()
Out[14]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 60 RL 65.00000 8450 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.00000 Gd TA PConc Gd TA No GLQ 706.00000 Unf 0.00000 150.00000 856.00000 GasA Ex Y SBrkr 856 854 0 1710 1.00000 0.00000 2 1 3 1 Gd 8 Typ 0 NaN Attchd 2003.00000 RFn 2.00000 548.00000 TA TA Y 0 61 0 0 0 0 NaN NaN NaN 0 2 2008 WD Normal
1 20 RL 80.00000 9600 Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.00000 TA TA CBlock Gd TA Gd ALQ 978.00000 Unf 0.00000 284.00000 1262.00000 GasA Ex Y SBrkr 1262 0 0 1262 0.00000 1.00000 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.00000 RFn 2.00000 460.00000 TA TA Y 298 0 0 0 0 0 NaN NaN NaN 0 5 2007 WD Normal
2 60 RL 68.00000 11250 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.00000 Gd TA PConc Gd TA Mn GLQ 486.00000 Unf 0.00000 434.00000 920.00000 GasA Ex Y SBrkr 920 866 0 1786 1.00000 0.00000 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.00000 RFn 2.00000 608.00000 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 9 2008 WD Normal
3 70 RL 60.00000 9550 Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.00000 TA TA BrkTil TA Gd No ALQ 216.00000 Unf 0.00000 540.00000 756.00000 GasA Gd Y SBrkr 961 756 0 1717 1.00000 0.00000 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.00000 Unf 3.00000 642.00000 TA TA Y 0 35 272 0 0 0 NaN NaN NaN 0 2 2006 WD Abnorml
4 60 RL 84.00000 14260 Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.00000 Gd TA PConc Gd TA Av GLQ 655.00000 Unf 0.00000 490.00000 1145.00000 GasA Ex Y SBrkr 1145 1053 0 2198 1.00000 0.00000 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.00000 RFn 3.00000 836.00000 TA TA Y 192 84 0 0 0 0 NaN NaN NaN 0 12 2008 WD Normal
In [15]:
percent_missing = round(df.isnull().sum() * 100 / len(df),3)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df[missing_value_df['percent_missing']>0].sort_values(by='percent_missing', ascending= False )
Out[15]:
column_name percent_missing
PoolQC PoolQC 99.65700
MiscFeature MiscFeature 96.40300
Alley Alley 93.21700
Fence Fence 80.43900
FireplaceQu FireplaceQu 48.64700
LotFrontage LotFrontage 16.65000
GarageFinish GarageFinish 5.44700
GarageQual GarageQual 5.44700
GarageCond GarageCond 5.44700
GarageYrBlt GarageYrBlt 5.44700
GarageType GarageType 5.37900
BsmtExposure BsmtExposure 2.80900
BsmtCond BsmtCond 2.80900
BsmtQual BsmtQual 2.77500
BsmtFinType2 BsmtFinType2 2.74100
BsmtFinType1 BsmtFinType1 2.70600
MasVnrType MasVnrType 0.82200
MasVnrArea MasVnrArea 0.78800
MSZoning MSZoning 0.13700
BsmtFullBath BsmtFullBath 0.06900
BsmtHalfBath BsmtHalfBath 0.06900
Functional Functional 0.06900
Utilities Utilities 0.06900
GarageArea GarageArea 0.03400
GarageCars GarageCars 0.03400
Electrical Electrical 0.03400
KitchenQual KitchenQual 0.03400
TotalBsmtSF TotalBsmtSF 0.03400
BsmtUnfSF BsmtUnfSF 0.03400
BsmtFinSF2 BsmtFinSF2 0.03400
BsmtFinSF1 BsmtFinSF1 0.03400
Exterior2nd Exterior2nd 0.03400
Exterior1st Exterior1st 0.03400
SaleType SaleType 0.03400

Let's analyse whether the feature has:

  • Missing value by random
  • Missing value by chance

From the metadata and data description, it looks like the houses without Pool, MiscFeatures, Alley access, Fence and Fireplace are are marked as missing. Hence replace those values with 'NotAvailable'

In [16]:
colToReplace = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
for i in colToReplace:
    df[i] = df[i].fillna('NotAvailable')
In [17]:
df['PoolQC'].value_counts()
Out[17]:
NotAvailable    2909
Ex                 4
Gd                 4
Fa                 2
Name: PoolQC, dtype: int64
In [18]:
round(df[colToReplace].isnull().sum() * 100 / len(df),3)
Out[18]:
PoolQC        0.00000
MiscFeature   0.00000
Alley         0.00000
Fence         0.00000
FireplaceQu   0.00000
dtype: float64

Garage

Houses with no Garage are missing. Hence replacing missing with 'NoGarage'

In [19]:
garageCols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','GarageYrBlt', 'GarageArea', 'GarageCars']
for i in garageCols[0:4]:
    df[i] = df[i].fillna('NoGarage')
In [20]:
round(df[garageCols].isnull().sum() * 100 / len(df),3)
Out[20]:
GarageType     0.00000
GarageFinish   0.00000
GarageQual     0.00000
GarageCond     0.00000
GarageYrBlt    5.44700
GarageArea     0.03400
GarageCars     0.03400
dtype: float64

Replacing 'GarageYrBlt', 'GarageArea', 'GarageCars' with 0, since there is no garage.

In [21]:
for i in garageCols[4:]:
    df[i] = df[i].fillna(0)

Basement

Houses with no basement are missing. Hence replacing missing with 'NoBasement'

In [22]:
basementCols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2']
round(df[basementCols].isnull().sum() * 100 / len(df),3)
Out[22]:
BsmtQual       2.77500
BsmtCond       2.80900
BsmtExposure   2.80900
BsmtFinType1   2.70600
BsmtFinType2   2.74100
dtype: float64
In [23]:
for i in basementCols:
    df[i] = df[i].fillna('NoBasement')
round(df[basementCols].isnull().sum() * 100 / len(df),3)
Out[23]:
BsmtQual       0.00000
BsmtCond       0.00000
BsmtExposure   0.00000
BsmtFinType1   0.00000
BsmtFinType2   0.00000
dtype: float64
In [24]:
for i in ['BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath','BsmtHalfBath']:
    df[i] = df[i].fillna(0)

Masonry veneer

In [25]:
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

Functional

In [26]:
df['Functional'] = df['Functional'].fillna('Typ')

Lot Frontage

In [27]:
df['Neighborhood'].value_counts()
Out[27]:
NAmes      443
CollgCr    267
OldTown    239
Edwards    194
Somerst    182
NridgHt    166
Gilbert    165
Sawyer     151
NWAmes     131
SawyerW    125
Mitchel    114
BrkSide    108
Crawfor    103
IDOTRR      93
Timber      72
NoRidge     71
StoneBr     51
SWISU       48
ClearCr     44
MeadowV     37
BrDale      30
Blmngtn     28
Veenker     24
NPkVill     23
Blueste     10
Name: Neighborhood, dtype: int64

Usually houses in same neighborhood look similar and therefore imputing missing values by median of lot frontage grouped by neighborhood

In [28]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
In [29]:
percent_missing = round(df.isnull().sum() * 100 / len(df),3)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df[missing_value_df['percent_missing']>0].sort_values(by='percent_missing', ascending= False )
Out[29]:
column_name percent_missing
MSZoning MSZoning 0.13700
Utilities Utilities 0.06900
Exterior1st Exterior1st 0.03400
Exterior2nd Exterior2nd 0.03400
Electrical Electrical 0.03400
KitchenQual KitchenQual 0.03400
SaleType SaleType 0.03400
In [30]:
cols = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType', 'Electrical']
In [31]:
for i in cols:
    df[i] = df[i].fillna(df[i].mode()[0])
In [32]:
plt.figure(figsize=(7,5))
sns.heatmap(df.isnull(), cbar=True, cmap= 'YlGnBu')
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ba5ca038e0>

Feature Engineering

In [33]:
categorical_cols = df.dtypes[ df.dtypes == 'object' ].index.tolist()
numerical_cols = df.dtypes[ df.dtypes != 'object' ].index.tolist()

print("Total Number of Features:", df.shape[1])
print("Total Number of Categorical Features:", len(categorical_cols))
print("Total Number of Numerical Features:", len(numerical_cols))
Total Number of Features: 79
Total Number of Categorical Features: 43
Total Number of Numerical Features: 36
In [34]:
# Custom Encoding

MSSubclass_map = {'20':1,'30':2,'40':3,'45':4,'50':5,'60':6,'70':7,'75':8,'80':9,
                  '85':10, '90':11,'120':12,'150':13,'160':14,'180':15,'190':16}

LotShape = {'Reg':3,'IR1':2,'IR2':1,'IR3':0}

utilities_map = {'AllPub':4, 'NoSewr':3, 'NoSeWa':2, 'ELO':1}

quality_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1, 'NoBasement':0, 'NotAvailable':0, 'NoGarage':0}

basement_exp = {'NoBasement':0,'No':1,'Mn':2,'Av':3,'Gd':4}

bsmtfin = {'NoBasement':0,'Unf':1,'LwQ':2, 'Rec':3,'BLQ':4,'ALQ':5, 'GLQ':6}

centralAir = {'N':0, 'Y':1}

Functional_map = {'Sal':0,'Sev':1,'Maj2':2,'Maj1':3,'Mod':4,'Min2':5,'Min1':6,'Typ':7}

GarageFinish =  {'NoGarage':0,'Unf':1,'RFn':2,'Fin':3}

fence_qc = {'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'NotAvailable':0}
In [35]:
q = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
     'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'HeatingQC']

for i in q:
    df[i] = df[i].replace(quality_map)
In [36]:
df['MSSubClass'] = df['MSSubClass'].apply(str).replace(MSSubclass_map)

df['LotShape'] = df['LotShape'].replace(LotShape)
df['Utilities'] = df['Utilities'].replace(utilities_map)

df['BsmtExposure'] = df['BsmtExposure'].replace(basement_exp)
df['BsmtFinType1'] = df['BsmtFinType1'].replace(bsmtfin)
df['BsmtFinType2'] = df['BsmtFinType2'].replace(bsmtfin)


df['CentralAir'] = df['CentralAir'].replace(centralAir)
df['Functional'] = df['Functional'].replace(Functional_map)
df['GarageFinish'] = df['GarageFinish'].replace(GarageFinish)
df['Fence'] = df['Fence'].replace(fence_qc)
In [37]:
#Creating a feature -> If house is remodelled then 1 else 0
df['Remodel'] = np.where(df['YearRemodAdd']==df['YearBuilt'], 1, 0)
df.drop('YearRemodAdd', axis=1, inplace=True)
In [38]:
#Transforming variable to age
df['HouseAge'] = 2020 - df['YearBuilt']
df.drop('YearBuilt', axis=1, inplace=True)
In [39]:
#Transforming variable to age
df['GarageYrBlt'] = 2020 - df['GarageYrBlt']
df.drop('GarageYrBlt', axis=1, inplace=True)
In [40]:
df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath'])+df['BsmtFullBath']+(0.5 * df['BsmtHalfBath']))

df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +df['EnclosedPorch'] + 
                        df['ScreenPorch'] + df['WoodDeckSF'])
In [41]:
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

def annote_graph(graph):
    for p in graph.patches:
        height = p.get_height()
        graph.text(p.get_x()+p.get_width()/2., height + 0.1,height ,ha="center")
In [42]:
temp_train = df.iloc[0:len(train_index)]
temp_train = pd.concat((temp_train, target), axis=1)
In [43]:
sns.pairplot(data=temp_train,
                  y_vars=['SalePrice'],
                  x_vars=['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath'])
Out[43]:
<seaborn.axisgrid.PairGrid at 0x1ba5ca825b0>
In [44]:
X = temp_train[['TotalBsmtSF','2ndFlrSF', 'BsmtFullBath', '1stFlrSF']]
calc_vif(X)
Out[44]:
variables VIF
0 TotalBsmtSF 21.52000
1 2ndFlrSF 1.42832
2 BsmtFullBath 1.86393
3 1stFlrSF 21.31870
In [45]:
sns.pairplot(data=temp_train,
                  y_vars=['SalePrice'],
                  x_vars=['Total_Bathrooms', 'FullBath', 'HalfBath', 'BsmtHalfBath'])
Out[45]:
<seaborn.axisgrid.PairGrid at 0x1ba5d27b190>
In [46]:
X = temp_train[['Total_Bathrooms', 'FullBath', 'HalfBath', 'BsmtHalfBath']]
calc_vif(X)
Out[46]:
variables VIF
0 Total_Bathrooms 18.67898
1 FullBath 15.92594
2 HalfBath 1.95346
3 BsmtHalfBath 1.05811
In [47]:
sns.pairplot(data=temp_train,
                  y_vars=['SalePrice'],
                  x_vars=['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF'])
Out[47]:
<seaborn.axisgrid.PairGrid at 0x1ba5e149b50>
In [48]:
X = temp_train[['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch','WoodDeckSF']]
calc_vif(X)
Out[48]:
variables VIF
0 OpenPorchSF 1.22850
1 3SsnPorch 1.00451
2 EnclosedPorch 1.02019
3 ScreenPorch 1.04652
4 WoodDeckSF 1.17991
In [49]:
fig, ax =plt.subplots(2,2, figsize=(10,10))


graph = sns.countplot(temp_train['PoolQC'], ax=ax[0,0])
annote_graph(graph)

graph = sns.countplot(temp_train['Utilities'], ax=ax[0,1])
annote_graph(graph)

graph = sns.countplot(temp_train['Street'], ax=ax[1,0])
annote_graph(graph)

fig.show()
fig.delaxes(ax[1][1])
In [50]:
cols_to_remove = ['PoolQC', 'Utilities', 'Street', 'FullBath', '1stFlrSF']
In [51]:
df.drop(cols_to_remove, axis=1, inplace=True)
In [52]:
df.head()
Out[52]:
MSSubClass MSZoning LotFrontage LotArea Alley LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition Remodel HouseAge TotalSF Total_Bathrooms Total_porch_sf
0 6 RL 65.00000 8450 NotAvailable 3 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 Gable CompShg VinylSd VinylSd BrkFace 196.00000 4 3 PConc 4 3 1 6 706.00000 1 0.00000 150.00000 856.00000 GasA 5 1 SBrkr 854 0 1710 1.00000 0.00000 1 3 1 4 8 7 0 0 Attchd 2 2.00000 548.00000 3 3 Y 0 61 0 0 0 0 0 NotAvailable 0 2 2008 WD Normal 1 17 2566.00000 3.50000 61
1 1 RL 80.00000 9600 NotAvailable 3 Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 Gable CompShg MetalSd MetalSd None 0.00000 3 3 CBlock 4 3 4 5 978.00000 1 0.00000 284.00000 1262.00000 GasA 5 1 SBrkr 0 0 1262 0.00000 1.00000 0 3 1 3 6 7 1 3 Attchd 2 2.00000 460.00000 3 3 Y 298 0 0 0 0 0 0 NotAvailable 0 5 2007 WD Normal 1 44 2524.00000 2.50000 298
2 6 RL 68.00000 11250 NotAvailable 2 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 Gable CompShg VinylSd VinylSd BrkFace 162.00000 4 3 PConc 4 3 2 6 486.00000 1 0.00000 434.00000 920.00000 GasA 5 1 SBrkr 866 0 1786 1.00000 0.00000 1 3 1 4 6 7 1 3 Attchd 2 2.00000 608.00000 3 3 Y 0 42 0 0 0 0 0 NotAvailable 0 9 2008 WD Normal 0 19 2706.00000 3.50000 42
3 7 RL 60.00000 9550 NotAvailable 2 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 Gable CompShg Wd Sdng Wd Shng None 0.00000 3 3 BrkTil 3 4 1 5 216.00000 1 0.00000 540.00000 756.00000 GasA 4 1 SBrkr 756 0 1717 1.00000 0.00000 0 3 1 4 7 7 1 4 Detchd 1 3.00000 642.00000 3 3 Y 0 35 272 0 0 0 0 NotAvailable 0 2 2006 WD Abnorml 0 105 2473.00000 2.00000 307
4 6 RL 84.00000 14260 NotAvailable 2 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 Gable CompShg VinylSd VinylSd BrkFace 350.00000 4 3 PConc 4 3 3 6 655.00000 1 0.00000 490.00000 1145.00000 GasA 5 1 SBrkr 1053 0 2198 1.00000 0.00000 1 4 1 4 9 7 1 3 Attchd 2 3.00000 836.00000 3 3 Y 192 84 0 0 0 0 0 NotAvailable 0 12 2008 WD Normal 1 20 3343.00000 3.50000 276
In [53]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 76 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MSSubClass       2919 non-null   int64  
 1   MSZoning         2919 non-null   object 
 2   LotFrontage      2919 non-null   float64
 3   LotArea          2919 non-null   int64  
 4   Alley            2919 non-null   object 
 5   LotShape         2919 non-null   int64  
 6   LandContour      2919 non-null   object 
 7   LotConfig        2919 non-null   object 
 8   LandSlope        2919 non-null   object 
 9   Neighborhood     2919 non-null   object 
 10  Condition1       2919 non-null   object 
 11  Condition2       2919 non-null   object 
 12  BldgType         2919 non-null   object 
 13  HouseStyle       2919 non-null   object 
 14  OverallQual      2919 non-null   int64  
 15  OverallCond      2919 non-null   int64  
 16  RoofStyle        2919 non-null   object 
 17  RoofMatl         2919 non-null   object 
 18  Exterior1st      2919 non-null   object 
 19  Exterior2nd      2919 non-null   object 
 20  MasVnrType       2919 non-null   object 
 21  MasVnrArea       2919 non-null   float64
 22  ExterQual        2919 non-null   int64  
 23  ExterCond        2919 non-null   int64  
 24  Foundation       2919 non-null   object 
 25  BsmtQual         2919 non-null   int64  
 26  BsmtCond         2919 non-null   int64  
 27  BsmtExposure     2919 non-null   int64  
 28  BsmtFinType1     2919 non-null   int64  
 29  BsmtFinSF1       2919 non-null   float64
 30  BsmtFinType2     2919 non-null   int64  
 31  BsmtFinSF2       2919 non-null   float64
 32  BsmtUnfSF        2919 non-null   float64
 33  TotalBsmtSF      2919 non-null   float64
 34  Heating          2919 non-null   object 
 35  HeatingQC        2919 non-null   int64  
 36  CentralAir       2919 non-null   int64  
 37  Electrical       2919 non-null   object 
 38  2ndFlrSF         2919 non-null   int64  
 39  LowQualFinSF     2919 non-null   int64  
 40  GrLivArea        2919 non-null   int64  
 41  BsmtFullBath     2919 non-null   float64
 42  BsmtHalfBath     2919 non-null   float64
 43  HalfBath         2919 non-null   int64  
 44  BedroomAbvGr     2919 non-null   int64  
 45  KitchenAbvGr     2919 non-null   int64  
 46  KitchenQual      2919 non-null   int64  
 47  TotRmsAbvGrd     2919 non-null   int64  
 48  Functional       2919 non-null   int64  
 49  Fireplaces       2919 non-null   int64  
 50  FireplaceQu      2919 non-null   int64  
 51  GarageType       2919 non-null   object 
 52  GarageFinish     2919 non-null   int64  
 53  GarageCars       2919 non-null   float64
 54  GarageArea       2919 non-null   float64
 55  GarageQual       2919 non-null   int64  
 56  GarageCond       2919 non-null   int64  
 57  PavedDrive       2919 non-null   object 
 58  WoodDeckSF       2919 non-null   int64  
 59  OpenPorchSF      2919 non-null   int64  
 60  EnclosedPorch    2919 non-null   int64  
 61  3SsnPorch        2919 non-null   int64  
 62  ScreenPorch      2919 non-null   int64  
 63  PoolArea         2919 non-null   int64  
 64  Fence            2919 non-null   int64  
 65  MiscFeature      2919 non-null   object 
 66  MiscVal          2919 non-null   int64  
 67  MoSold           2919 non-null   int64  
 68  YrSold           2919 non-null   int64  
 69  SaleType         2919 non-null   object 
 70  SaleCondition    2919 non-null   object 
 71  Remodel          2919 non-null   int32  
 72  HouseAge         2919 non-null   int64  
 73  TotalSF          2919 non-null   float64
 74  Total_Bathrooms  2919 non-null   float64
 75  Total_porch_sf   2919 non-null   int64  
dtypes: float64(12), int32(1), int64(40), object(23)
memory usage: 1.7+ MB

Target variable

In [54]:
sns.distplot(target, color='blue')
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ba5f597700>
In [55]:
from scipy import stats
import numpy as np
z = np.abs(stats.zscore(target))
print(z)
[0.34727322 0.00728832 0.53615372 ... 1.07761115 0.48852299 0.42084081]
In [56]:
threshold = 3
outlier_index = np.where(z > 3)[0]
print("Number of outliers based on z-score:", len(outlier_index))
Number of outliers based on z-score: 22

The loss function for the regression model is mean_absolute_error which is somewhat robust to outliers compared to mean_squared error which penalizes even a small error which leads to over-estimation of how bad the model is

1. Feature Selection using Predictive Power Score

In [57]:
import ppscore as pps
In [58]:
temp_train = df.iloc[0:len(train_index)]
temp_train = pd.concat((temp_train, target), axis=1)
temp_train.head()
Out[58]:
MSSubClass MSZoning LotFrontage LotArea Alley LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition Remodel HouseAge TotalSF Total_Bathrooms Total_porch_sf SalePrice
0 6 RL 65.00000 8450 NotAvailable 3 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 Gable CompShg VinylSd VinylSd BrkFace 196.00000 4 3 PConc 4 3 1 6 706.00000 1 0.00000 150.00000 856.00000 GasA 5 1 SBrkr 854 0 1710 1.00000 0.00000 1 3 1 4 8 7 0 0 Attchd 2 2.00000 548.00000 3 3 Y 0 61 0 0 0 0 0 NotAvailable 0 2 2008 WD Normal 1 17 2566.00000 3.50000 61 208500
1 1 RL 80.00000 9600 NotAvailable 3 Lvl FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 Gable CompShg MetalSd MetalSd None 0.00000 3 3 CBlock 4 3 4 5 978.00000 1 0.00000 284.00000 1262.00000 GasA 5 1 SBrkr 0 0 1262 0.00000 1.00000 0 3 1 3 6 7 1 3 Attchd 2 2.00000 460.00000 3 3 Y 298 0 0 0 0 0 0 NotAvailable 0 5 2007 WD Normal 1 44 2524.00000 2.50000 298 181500
2 6 RL 68.00000 11250 NotAvailable 2 Lvl Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 Gable CompShg VinylSd VinylSd BrkFace 162.00000 4 3 PConc 4 3 2 6 486.00000 1 0.00000 434.00000 920.00000 GasA 5 1 SBrkr 866 0 1786 1.00000 0.00000 1 3 1 4 6 7 1 3 Attchd 2 2.00000 608.00000 3 3 Y 0 42 0 0 0 0 0 NotAvailable 0 9 2008 WD Normal 0 19 2706.00000 3.50000 42 223500
3 7 RL 60.00000 9550 NotAvailable 2 Lvl Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 Gable CompShg Wd Sdng Wd Shng None 0.00000 3 3 BrkTil 3 4 1 5 216.00000 1 0.00000 540.00000 756.00000 GasA 4 1 SBrkr 756 0 1717 1.00000 0.00000 0 3 1 4 7 7 1 4 Detchd 1 3.00000 642.00000 3 3 Y 0 35 272 0 0 0 0 NotAvailable 0 2 2006 WD Abnorml 0 105 2473.00000 2.00000 307 140000
4 6 RL 84.00000 14260 NotAvailable 2 Lvl FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 Gable CompShg VinylSd VinylSd BrkFace 350.00000 4 3 PConc 4 3 3 6 655.00000 1 0.00000 490.00000 1145.00000 GasA 5 1 SBrkr 1053 0 2198 1.00000 0.00000 1 4 1 4 9 7 1 3 Attchd 2 3.00000 836.00000 3 3 Y 192 84 0 0 0 0 0 NotAvailable 0 12 2008 WD Normal 1 20 3343.00000 3.50000 276 250000
In [59]:
col = temp_train.columns#.score will be a column in the matrix below
train_pps = temp_train[col]
pps_mat = pps.matrix(train_pps)
In [60]:
pps_mat[pps_mat['x'] == 'SalePrice'].sort_values(by='ppscore', ascending=False)
Out[60]:
x y ppscore case is_valid_score metric baseline_score model_score model
5928 SalePrice SalePrice 1.00000 predict_itself True None 0.00000 1.00000 None
5925 SalePrice TotalSF 0.31493 regression True mean absolute error 609.30205 417.41354 DecisionTreeRegressor()
5866 SalePrice OverallQual 0.31044 regression True mean absolute error 1.07329 0.74010 DecisionTreeRegressor()
5898 SalePrice KitchenQual 0.30749 regression True mean absolute error 0.56507 0.39132 DecisionTreeRegressor()
5902 SalePrice FireplaceQu 0.26544 regression True mean absolute error 1.74315 1.28045 DecisionTreeRegressor()
5874 SalePrice ExterQual 0.26195 regression True mean absolute error 0.41507 0.30634 DecisionTreeRegressor()
5876 SalePrice Foundation 0.26164 classification True weighted F1 0.41301 0.56659 DecisionTreeClassifier()
5877 SalePrice BsmtQual 0.23855 regression True mean absolute error 0.67671 0.51528 DecisionTreeRegressor()
5903 SalePrice GarageType 0.18883 classification True weighted F1 0.44500 0.54980 DecisionTreeClassifier()
5904 SalePrice GarageFinish 0.18736 regression True mean absolute error 0.76644 0.62284 DecisionTreeRegressor()
5892 SalePrice GrLivArea 0.16492 regression True mean absolute error 394.53219 329.46657 DecisionTreeRegressor()
5901 SalePrice Fireplaces 0.16339 regression True mean absolute error 0.55822 0.46701 DecisionTreeRegressor()
5872 SalePrice MasVnrType 0.14380 classification True weighted F1 0.45342 0.53202 DecisionTreeClassifier()
5926 SalePrice Total_Bathrooms 0.14249 regression True mean absolute error 0.61130 0.52420 DecisionTreeRegressor()
5924 SalePrice HouseAge 0.14096 regression True mean absolute error 25.04178 21.51190 DecisionTreeRegressor()
5905 SalePrice GarageCars 0.13477 regression True mean absolute error 0.49452 0.42787 DecisionTreeRegressor()
5887 SalePrice HeatingQC 0.11121 regression True mean absolute error 0.85479 0.75973 DecisionTreeRegressor()
5906 SalePrice GarageArea 0.09766 regression True mean absolute error 159.93356 144.31482 DecisionTreeRegressor()
5861 SalePrice Neighborhood 0.08902 classification True weighted F1 0.07397 0.15641 DecisionTreeClassifier()
5868 SalePrice RoofStyle 0.07577 classification True weighted F1 0.68566 0.70948 DecisionTreeClassifier()
5871 SalePrice Exterior2nd 0.06711 classification True weighted F1 0.19315 0.24730 DecisionTreeClassifier()
5870 SalePrice Exterior1st 0.06023 classification True weighted F1 0.20000 0.24818 DecisionTreeClassifier()
5865 SalePrice HouseStyle 0.03862 classification True weighted F1 0.36712 0.39156 DecisionTreeClassifier()
5921 SalePrice SaleType 0.02848 classification True weighted F1 0.80639 0.81190 DecisionTreeClassifier()
5885 SalePrice TotalBsmtSF 0.02282 regression True mean absolute error 317.03219 309.79728 DecisionTreeRegressor()
5853 SalePrice MSZoning 0.00507 classification True weighted F1 0.69506 0.69660 DecisionTreeClassifier()
5922 SalePrice SaleCondition 0.00110 classification True weighted F1 0.73967 0.73995 DecisionTreeClassifier()
5908 SalePrice GarageCond 0.00000 regression True mean absolute error 0.20890 0.31444 DecisionTreeRegressor()
5927 SalePrice Total_porch_sf 0.00000 regression True mean absolute error 121.62808 132.61377 DecisionTreeRegressor()
5900 SalePrice Functional 0.00000 regression True mean absolute error 0.15822 0.30011 DecisionTreeRegressor()
5907 SalePrice GarageQual 0.00000 regression True mean absolute error 0.21712 0.33203 DecisionTreeRegressor()
5923 SalePrice Remodel 0.00000 regression True mean absolute error 0.47671 0.47878 DecisionTreeRegressor()
5909 SalePrice PavedDrive 0.00000 classification True weighted F1 0.87847 0.86894 DecisionTreeClassifier()
5910 SalePrice WoodDeckSF 0.00000 regression True mean absolute error 94.24452 105.92018 DecisionTreeRegressor()
5911 SalePrice OpenPorchSF 0.00000 regression True mean absolute error 44.67671 49.85653 DecisionTreeRegressor()
5912 SalePrice EnclosedPorch 0.00000 regression True mean absolute error 21.95411 36.50374 DecisionTreeRegressor()
5913 SalePrice 3SsnPorch 0.00000 regression True mean absolute error 3.40959 5.98871 DecisionTreeRegressor()
5914 SalePrice ScreenPorch 0.00000 regression True mean absolute error 15.06096 26.62788 DecisionTreeRegressor()
5915 SalePrice PoolArea 0.00000 regression True mean absolute error 2.75890 5.53156 DecisionTreeRegressor()
5916 SalePrice Fence 0.00000 regression True mean absolute error 0.56575 0.89582 DecisionTreeRegressor()
5917 SalePrice MiscFeature 0.00000 classification True weighted F1 0.94487 0.93726 DecisionTreeClassifier()
5918 SalePrice MiscVal 0.00000 regression True mean absolute error 43.48904 88.43827 DecisionTreeRegressor()
5919 SalePrice MoSold 0.00000 regression True mean absolute error 2.10548 2.63743 DecisionTreeRegressor()
5920 SalePrice YrSold 0.00000 regression True mean absolute error 1.12671 1.32359 DecisionTreeRegressor()
5899 SalePrice TotRmsAbvGrd 0.00000 regression True mean absolute error 1.23562 1.29455 DecisionTreeRegressor()
5852 SalePrice MSSubClass 0.00000 regression True mean absolute error 3.48493 4.04855 DecisionTreeRegressor()
5897 SalePrice KitchenAbvGr 0.00000 regression True mean absolute error 0.04795 0.08431 DecisionTreeRegressor()
5875 SalePrice ExterCond 0.00000 regression True mean absolute error 0.12466 0.22555 DecisionTreeRegressor()
5854 SalePrice LotFrontage 0.00000 regression True mean absolute error 15.00240 17.47085 DecisionTreeRegressor()
5855 SalePrice LotArea 0.00000 regression True mean absolute error 3604.20342 4773.84433 DecisionTreeRegressor()
5856 SalePrice Alley 0.00000 classification True weighted F1 0.90751 0.88574 DecisionTreeClassifier()
5857 SalePrice LotShape 0.00000 regression True mean absolute error 0.40822 0.47317 DecisionTreeRegressor()
5858 SalePrice LandContour 0.00000 classification True weighted F1 0.84966 0.83141 DecisionTreeClassifier()
5859 SalePrice LotConfig 0.00000 classification True weighted F1 0.60352 0.55547 DecisionTreeClassifier()
5860 SalePrice LandSlope 0.00000 classification True weighted F1 0.92060 0.90769 DecisionTreeClassifier()
5862 SalePrice Condition1 0.00000 classification True weighted F1 0.79956 0.77682 DecisionTreeClassifier()
5863 SalePrice Condition2 0.00000 classification True weighted F1 0.98462 0.98324 DecisionTreeClassifier()
5864 SalePrice BldgType 0.00000 classification True weighted F1 0.76079 0.74612 DecisionTreeClassifier()
5867 SalePrice OverallCond 0.00000 regression True mean absolute error 0.74795 0.96035 DecisionTreeRegressor()
5869 SalePrice RoofMatl 0.00000 classification True weighted F1 0.97337 0.96643 DecisionTreeClassifier()
5873 SalePrice MasVnrArea 0.00000 regression True mean absolute error 103.11712 123.30147 DecisionTreeRegressor()
5878 SalePrice BsmtCond 0.00000 regression True mean absolute error 0.15411 0.25949 DecisionTreeRegressor()
5896 SalePrice BedroomAbvGr 0.00000 regression True mean absolute error 0.51849 0.69886 DecisionTreeRegressor()
5879 SalePrice BsmtExposure 0.00000 regression True mean absolute error 0.68219 0.87881 DecisionTreeRegressor()
5880 SalePrice BsmtFinType1 0.00000 regression True mean absolute error 1.90068 1.92157 DecisionTreeRegressor()
5881 SalePrice BsmtFinSF1 0.00000 regression True mean absolute error 364.53562 377.46132 DecisionTreeRegressor()
5882 SalePrice BsmtFinType2 0.00000 regression True mean absolute error 0.29932 0.53344 DecisionTreeRegressor()
5883 SalePrice BsmtFinSF2 0.00000 regression True mean absolute error 46.54932 86.84909 DecisionTreeRegressor()
5884 SalePrice BsmtUnfSF 0.00000 regression True mean absolute error 347.25822 418.67519 DecisionTreeRegressor()
5886 SalePrice Heating 0.00000 classification True weighted F1 0.96724 0.96415 DecisionTreeClassifier()
5888 SalePrice CentralAir 0.00000 regression True mean absolute error 0.06507 0.08882 DecisionTreeRegressor()
5889 SalePrice Electrical 0.00000 classification True weighted F1 0.87349 0.86844 DecisionTreeClassifier()
5891 SalePrice LowQualFinSF 0.00000 regression True mean absolute error 5.84452 12.33944 DecisionTreeRegressor()
5893 SalePrice BsmtFullBath 0.00000 regression True mean absolute error 0.42534 0.47453 DecisionTreeRegressor()
5894 SalePrice BsmtHalfBath 0.00000 regression True mean absolute error 0.05753 0.10067 DecisionTreeRegressor()
5895 SalePrice HalfBath 0.00000 regression True mean absolute error 0.38288 0.41423 DecisionTreeRegressor()
5890 SalePrice 2ndFlrSF 0.00000 regression True mean absolute error 346.99247 396.72546 DecisionTreeRegressor()
In [61]:
# Selecting features only with ppscore > 0
pps_features = pps_mat.loc[(pps_mat['x'] == 'SalePrice') & (pps_mat['ppscore'] > 0)]['y'].values
In [62]:
print("Number of features using PPS: ",len(pps_features)), pps_features
Number of features using PPS:  27
Out[62]:
(None,
 array(['MSZoning', 'Neighborhood', 'HouseStyle', 'OverallQual',
        'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'Foundation', 'BsmtQual', 'TotalBsmtSF', 'HeatingQC',
        'GrLivArea', 'KitchenQual', 'Fireplaces', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea',
        'SaleType', 'SaleCondition', 'HouseAge', 'TotalSF',
        'Total_Bathrooms', 'SalePrice'], dtype=object))

Recreating training and testing datasets

In [63]:
def create_train_test(df, features, trainIndex):
    df = df[features[:-1]]
    
    categorical_cols = df.dtypes[ df.dtypes == 'object' ].index.tolist()
    numerical_cols = df.dtypes[ df.dtypes != 'object' ].index.tolist()
    
    cat_dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
    
    df = df.drop(categorical_cols, axis=1)
    df = pd.concat([df, cat_dummies], axis=1)
    
    train = df.iloc[0:len(trainIndex)]
    test = df.iloc[len(trainIndex):]
    print('Shape of Train Dataset:',train.shape)
    print('Shape of Test Dataset:',test.shape)
    
    return(train,test)
In [64]:
# Train and Test data with all the features
train_all, test_all = create_train_test(df, df.columns, train_index)
Shape of Train Dataset: (1460, 200)
Shape of Test Dataset: (1459, 200)
In [65]:
# Train and Test data with features selected by predictive power score
train_pps, test_pps = create_train_test(df, pps_features, train_index)
Shape of Train Dataset: (1460, 111)
Shape of Test Dataset: (1459, 111)
In [66]:
from sklearn.preprocessing import StandardScaler

def scaler_function(train_data, target):
    scaler = StandardScaler()
    scaler.fit(train_data)
    train_std = scaler.transform(train_data)
    train = pd.DataFrame(train_std, columns=train_data.columns)
    train = pd.concat((train,target),axis=1)
    
    return scaler, train
In [67]:
scaler_all, train_scaled_all = scaler_function(train_all, target)

scaler_pps, train_scaled_pps = scaler_function(train_pps, target)
In [68]:
train_scaled_pps.shape, train_scaled_all.shape
Out[68]:
((1460, 112), (1460, 201))

Hyperopt

In [69]:
from hpsklearn import HyperoptEstimator
from hpsklearn import xgboost_regression, gradient_boosting_regression, extra_trees_regression, ada_boost_regression
from hyperopt import tpe
WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely
In [70]:
model_list = [xgboost_regression, gradient_boosting_regression, extra_trees_regression, ada_boost_regression]
model_names = ['xgBoost', 'gradBoost', 'extraTree', 'adaBoost']
In [71]:
from sklearn import metrics
def model_eval(model, y_true, y_pred):
    mae = metrics.mean_absolute_error(y_true.values, y_pred)
    mse = metrics.mean_squared_error(y_true.values, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_true.values, y_pred))
    r2score = metrics.r2_score(y_true.values, y_pred)
    
    scores = [mse,rmse, mae,r2score]
    return([round(x,5) for x in scores])
In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pickle
import os
from pathlib import Path

def modeling(df, target_col, model_list, model_names, test_size=0.3, folder='model_dir', save_models=False):
    
    folder = 'Models/'+folder
    p = Path(folder)
    p.mkdir(parents=True, exist_ok=True)
    
    # evaluating scores -> r2, rmse, mae 
    train_scores = []
    valid_scores = []
    
    features = df.columns.tolist()
    features.remove(target_col)
    
    X = df[features]  
    y = df[target_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=test_size)
    
    
    for algo, name in zip(model_list,model_names):
        
        model = HyperoptEstimator(regressor=algo(name),
                                  preprocessing=[],
                                  max_evals=N_ITER,
                                  loss_fn=mean_absolute_error,
                                  #algo=tpe.suggest,
                                  trial_timeout=60, 
                                  seed=RANDOM_STATE)
        
        model.fit(X_train.values, y_train.values)
        
        #--------- Train Data Evaluation ---------
        predictions = model.predict(X_train.values)
        scores = model_eval(model= model, y_true= y_train, y_pred= predictions)
        train_scores.append(scores)
    
        #--------- Test Data Evaluation ---------
        predictions = model.predict(X_test.values)
        scores = model_eval(model= model, y_true= y_test, y_pred= predictions)
        valid_scores.append(scores)
        
        if save_models:      
            #print(model.best_model())
            pkl_filename = name+'.pkl'
        
            with open(p/pkl_filename, 'wb') as file:
                pickle.dump(model, file)

            #joblib.dump(model, filename)
    
    train_score_df = pd.DataFrame(train_scores, columns=['mse','rmse', 'mae','r2'], index= model_names)
    valid_score_df = pd.DataFrame(valid_scores, columns=['mse','rmse', 'mae','r2'], index= model_names)
        
    
    return (train_score_df, valid_score_df)
In [74]:
RANDOM_STATE = 42
N_ITER = 40
In [75]:
tr_df, val_df = modeling(df = train_scaled_all, 
                         target_col = target_col, 
                         model_list = model_list, 
                         model_names = model_names, 
                         folder = 'fullData'
                        )
100%|██████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.35s/trial, best loss: 16264.682850609755]
100%|███████████████████████████████████████████████████| 2/2 [00:12<00:00,  6.35s/trial, best loss: 13948.03250762195]
100%|██████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.58s/trial, best loss: 13066.706288109755]
100%|██████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.62trial/s, best loss: 13066.706288109755]
100%|██████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.48s/trial, best loss: 13066.706288109755]
100%|██████████████████████████████████████████████████| 6/6 [00:04<00:00,  1.26trial/s, best loss: 13066.706288109755]
100%|██████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.73trial/s, best loss: 13066.706288109755]
100%|██████████████████████████████████████████████████| 8/8 [00:06<00:00,  1.30trial/s, best loss: 13066.706288109755]
100%|██████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.25trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.12trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 11/11 [00:03<00:00,  3.41trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 12/12 [00:04<00:00,  2.86trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.38trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 14/14 [00:06<00:00,  2.07trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 15/15 [00:07<00:00,  1.99trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.74trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 17/17 [00:05<00:00,  2.96trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 18/18 [00:04<00:00,  3.71trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 19/19 [00:05<00:00,  3.18trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.07trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 21/21 [00:06<00:00,  3.31trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.37trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 23/23 [00:02<00:00,  8.47trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 24/24 [00:05<00:00,  4.04trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 25/25 [00:05<00:00,  4.36trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 26/26 [00:06<00:00,  4.23trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 27/27 [00:02<00:00, 10.56trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 28/28 [00:06<00:00,  4.25trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 29/29 [00:03<00:00,  7.52trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 30/30 [00:06<00:00,  4.98trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 31/31 [00:05<00:00,  5.21trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 32/32 [00:05<00:00,  5.73trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 33/33 [00:02<00:00, 13.25trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 13.59trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 35/35 [00:03<00:00,  8.85trial/s, best loss: 13066.706288109755]
100%|████████████████████████████████████████████████| 36/36 [00:02<00:00, 12.84trial/s, best loss: 12846.019397865854]
100%|████████████████████████████████████████████████| 37/37 [00:06<00:00,  6.13trial/s, best loss: 12846.019397865854]
100%|████████████████████████████████████████████████| 38/38 [00:03<00:00, 12.09trial/s, best loss: 12846.019397865854]
100%|████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.37trial/s, best loss: 12846.019397865854]
100%|████████████████████████████████████████████████| 40/40 [00:04<00:00,  8.43trial/s, best loss: 12846.019397865854]
[01:22:54] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
100%|███████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.85s/trial, best loss: 51409.30316549314]
100%|███████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.02trial/s, best loss: 49089.65763864067]
100%|███████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.41trial/s, best loss: 38671.15129565798]
100%|███████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.24trial/s, best loss: 38671.15129565798]
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.10trial/s, best loss: 32694.231075996042]
100%|██████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.04trial/s, best loss: 24108.356194179316]
100%|██████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.52trial/s, best loss: 24108.356194179316]
100%|██████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.84trial/s, best loss: 15804.974690808967]
100%|██████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.86trial/s, best loss: 15804.974690808967]
100%|████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.49trial/s, best loss: 15275.939275696883]
100%|████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.35trial/s, best loss: 13780.134241852733]
100%|████████████████████████████████████████████████| 12/12 [00:01<00:00,  6.73trial/s, best loss: 13780.134241852733]
100%|████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.29trial/s, best loss: 13780.134241852733]
100%|████████████████████████████████████████████████| 14/14 [00:02<00:00,  5.88trial/s, best loss: 13780.134241852733]
100%|████████████████████████████████████████████████| 15/15 [00:06<00:00,  2.44trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 16/16 [00:04<00:00,  3.38trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 17/17 [00:02<00:00,  8.01trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 18/18 [00:01<00:00,  9.54trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 19/19 [00:03<00:00,  5.32trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 20/20 [00:02<00:00,  9.70trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 21/21 [00:02<00:00,  8.90trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 22/22 [00:01<00:00, 11.45trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 23/23 [00:10<00:00,  2.28trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 24/24 [00:05<00:00,  4.51trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 25/25 [00:02<00:00,  9.48trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 26/26 [00:01<00:00, 13.56trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 27/27 [00:01<00:00, 14.03trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 28/28 [00:02<00:00, 11.30trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 29/29 [00:03<00:00,  7.71trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 30/30 [00:01<00:00, 15.30trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 31/31 [00:02<00:00, 14.83trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 32/32 [00:01<00:00, 17.10trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 33/33 [00:15<00:00,  2.16trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 14.91trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 35/35 [00:02<00:00, 16.83trial/s, best loss: 13609.715156123351]
100%|████████████████████████████████████████████████| 36/36 [00:03<00:00, 10.43trial/s, best loss: 13084.204177781361]
100%|████████████████████████████████████████████████| 37/37 [00:01<00:00, 19.64trial/s, best loss: 13084.204177781361]
100%|████████████████████████████████████████████████| 38/38 [00:44<00:00,  1.17s/trial, best loss: 13084.204177781361]
100%|████████████████████████████████████████████████| 39/39 [00:02<00:00, 15.04trial/s, best loss: 13084.204177781361]
100%|████████████████████████████████████████████████| 40/40 [00:01<00:00, 21.68trial/s, best loss: 13084.204177781361]
100%|██████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.94s/trial, best loss: 15730.658314855877]
100%|███████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.71s/trial, best loss: 14993.90049868267]
100%|███████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.41trial/s, best loss: 14993.90049868267]
100%|███████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.28trial/s, best loss: 14993.90049868267]
100%|███████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.19trial/s, best loss: 14993.90049868267]
100%|███████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.19trial/s, best loss: 14993.90049868267]
100%|██████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.05trial/s, best loss: 14654.874456422076]
100%|██████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.06trial/s, best loss: 14654.874456422076]
100%|██████████████████████████████████████████████████| 9/9 [00:01<00:00,  4.79trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.79trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 11/11 [00:02<00:00,  3.77trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 12/12 [00:02<00:00,  5.43trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 13/13 [00:02<00:00,  6.07trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 14/14 [00:02<00:00,  5.52trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 15/15 [00:01<00:00,  8.00trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.65trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 17/17 [00:01<00:00,  9.10trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 18/18 [00:01<00:00,  9.76trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 19/19 [00:02<00:00,  6.78trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 20/20 [00:02<00:00,  9.76trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 21/21 [00:02<00:00, 10.47trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 22/22 [00:02<00:00,  8.20trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 23/23 [00:02<00:00, 10.97trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 24/24 [00:04<00:00,  4.92trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 25/25 [00:01<00:00, 12.91trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 26/26 [00:02<00:00, 11.14trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 27/27 [00:02<00:00, 10.17trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 28/28 [00:01<00:00, 15.10trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 29/29 [00:01<00:00, 15.45trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 30/30 [00:02<00:00, 13.24trial/s, best loss: 14654.874456422076]
100%|████████████████████████████████████████████████| 31/31 [00:03<00:00,  9.34trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 32/32 [00:07<00:00,  4.42trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 33/33 [00:02<00:00, 12.36trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 15.79trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 35/35 [00:02<00:00, 17.20trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 36/36 [00:02<00:00, 12.77trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 37/37 [00:01<00:00, 20.09trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 38/38 [00:04<00:00,  8.88trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 39/39 [00:04<00:00,  7.92trial/s, best loss: 14484.459041211103]
100%|████████████████████████████████████████████████| 40/40 [00:03<00:00, 10.29trial/s, best loss: 14484.459041211103]
100%|███████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.92s/trial, best loss: 21548.01916079879]
100%|██████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.31s/trial, best loss: 20696.942900493395]
100%|██████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.16trial/s, best loss: 20696.942900493395]
100%|██████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.07trial/s, best loss: 20696.942900493395]
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.88trial/s, best loss: 19442.710401863467]
100%|██████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.74trial/s, best loss: 19442.710401863467]
100%|██████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.56trial/s, best loss: 19442.710401863467]
100%|██████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.24trial/s, best loss: 19442.710401863467]
100%|██████████████████████████████████████████████████| 9/9 [00:01<00:00,  4.93trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.29trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.38trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 12/12 [00:02<00:00,  5.85trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.86trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 14/14 [00:03<00:00,  3.56trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 15/15 [00:01<00:00,  7.61trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.45trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 17/17 [00:01<00:00,  9.50trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 18/18 [00:02<00:00,  8.24trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 19/19 [00:02<00:00,  6.40trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.37trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 21/21 [00:01<00:00, 11.69trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 22/22 [00:02<00:00,  7.96trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 23/23 [00:02<00:00,  9.21trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 24/24 [00:03<00:00,  7.27trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 25/25 [00:01<00:00, 13.09trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 26/26 [00:03<00:00,  6.99trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 27/27 [00:02<00:00, 11.01trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 28/28 [00:01<00:00, 14.31trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 29/29 [00:02<00:00, 13.82trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 30/30 [00:02<00:00, 10.08trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 31/31 [00:02<00:00, 13.76trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 32/32 [00:03<00:00, 10.17trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 33/33 [00:02<00:00, 12.50trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 14.21trial/s, best loss: 19442.710401863467]
100%|████████████████████████████████████████████████| 35/35 [00:02<00:00, 13.18trial/s, best loss: 19245.192561016705]
100%|████████████████████████████████████████████████| 36/36 [00:02<00:00, 16.84trial/s, best loss: 19245.192561016705]
100%|████████████████████████████████████████████████| 37/37 [00:01<00:00, 19.59trial/s, best loss: 19245.192561016705]
100%|████████████████████████████████████████████████| 38/38 [00:05<00:00,  7.46trial/s, best loss: 19245.192561016705]
100%|████████████████████████████████████████████████| 39/39 [00:02<00:00, 13.22trial/s, best loss: 19245.192561016705]
100%|████████████████████████████████████████████████| 40/40 [00:03<00:00, 10.43trial/s, best loss: 19245.192561016705]
In [76]:
tr_df
Out[76]:
mse rmse mae r2
xgBoost 237100587.85477 15398.07091 10923.53262 0.96061
gradBoost 398592642.99777 19964.78507 9559.88001 0.93377
extraTree 0.00000 0.00000 0.00000 1.00000
adaBoost 820986274.69110 28652.85805 21097.87749 0.86359
In [77]:
val_df
Out[77]:
mse rmse mae r2
xgBoost 583323942.86768 24152.10018 15670.37807 0.91641
gradBoost 778972127.31241 27910.07215 14940.65892 0.88837
extraTree 790052396.58332 28107.87072 17203.49769 0.88678
adaBoost 1053239768.42781 32453.65570 21959.73341 0.84906
In [78]:
RANDOM_STATE = 42
N_ITER = 40

tr_df_pps, val_df_pps = modeling(df=train_scaled_pps, target_col=target_col, 
                                         model_list=model_list, model_names=model_names, folder='pps')
100%|███████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.15s/trial, best loss: 20697.60508765244]
100%|██████████████████████████████████████████████████| 2/2 [00:12<00:00,  6.09s/trial, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.34s/trial, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.67trial/s, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.33s/trial, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.55trial/s, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.93trial/s, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.52trial/s, best loss: 15912.847103658536]
100%|██████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.39trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.57trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 11/11 [00:03<00:00,  3.57trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.19trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 13/13 [00:04<00:00,  2.76trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 14/14 [00:06<00:00,  2.30trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 15/15 [00:06<00:00,  2.30trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 16/16 [00:03<00:00,  5.03trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 17/17 [00:05<00:00,  3.29trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 18/18 [00:03<00:00,  4.50trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 19/19 [00:05<00:00,  3.65trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.99trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 21/21 [00:05<00:00,  3.95trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.69trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 23/23 [00:02<00:00,  9.03trial/s, best loss: 15912.847103658536]
100%|████████████████████████████████████████████████| 24/24 [00:05<00:00,  4.36trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 25/25 [00:05<00:00,  4.89trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 26/26 [00:05<00:00,  4.60trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 27/27 [00:02<00:00, 10.93trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 28/28 [00:05<00:00,  5.13trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.28trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.90trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.36trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 32/32 [00:04<00:00,  6.70trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 33/33 [00:02<00:00, 14.60trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 13.67trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 35/35 [00:03<00:00, 10.38trial/s, best loss: 15690.747808689024]
100%|████████████████████████████████████████████████| 36/36 [00:02<00:00, 13.52trial/s, best loss: 15649.277858231708]
100%|████████████████████████████████████████████████| 37/37 [00:05<00:00,  7.40trial/s, best loss: 15649.277858231708]
100%|████████████████████████████████████████████████| 38/38 [00:02<00:00, 13.83trial/s, best loss: 15649.277858231708]
100%|████████████████████████████████████████████████| 39/39 [00:03<00:00, 12.01trial/s, best loss: 15649.277858231708]
100%|████████████████████████████████████████████████| 40/40 [00:03<00:00, 10.47trial/s, best loss: 15649.277858231708]
[01:32:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
100%|███████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.77s/trial, best loss: 51388.57616103797]
100%|███████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.02s/trial, best loss: 49081.13029526752]
100%|███████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.51trial/s, best loss: 38742.61484906596]
100%|███████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.19trial/s, best loss: 38742.61484906596]
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.39trial/s, best loss: 32659.517332934494]
100%|██████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.16trial/s, best loss: 23596.946771962015]
100%|██████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.93trial/s, best loss: 23596.946771962015]
100%|██████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.84trial/s, best loss: 18492.909443713994]
100%|██████████████████████████████████████████████████| 9/9 [00:01<00:00,  4.50trial/s, best loss: 18492.909443713994]
100%|████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.58trial/s, best loss: 18229.436359079868]
100%|█████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.48trial/s, best loss: 17017.15039968081]
100%|█████████████████████████████████████████████████| 12/12 [00:01<00:00,  6.29trial/s, best loss: 17017.15039968081]
100%|█████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.46trial/s, best loss: 17017.15039968081]
100%|█████████████████████████████████████████████████| 14/14 [00:02<00:00,  6.50trial/s, best loss: 17017.15039968081]
100%|████████████████████████████████████████████████| 15/15 [00:05<00:00,  2.90trial/s, best loss: 15808.597675079276]
100%|████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.88trial/s, best loss: 15808.597675079276]
100%|████████████████████████████████████████████████| 17/17 [00:01<00:00,  8.51trial/s, best loss: 15808.597675079276]
100%|████████████████████████████████████████████████| 18/18 [00:01<00:00,  9.52trial/s, best loss: 15808.597675079276]
100%|█████████████████████████████████████████████████| 19/19 [00:02<00:00,  6.51trial/s, best loss: 15572.39508271994]
100%|█████████████████████████████████████████████████| 20/20 [00:01<00:00, 10.35trial/s, best loss: 15480.78772352107]
100%|█████████████████████████████████████████████████| 21/21 [00:02<00:00, 10.21trial/s, best loss: 15480.78772352107]
100%|█████████████████████████████████████████████████| 22/22 [00:01<00:00, 12.00trial/s, best loss: 15480.78772352107]
100%|█████████████████████████████████████████████████| 23/23 [00:05<00:00,  3.91trial/s, best loss: 15480.78772352107]
100%|████████████████████████████████████████████████| 24/24 [00:05<00:00,  4.24trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 25/25 [00:02<00:00, 10.29trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 26/26 [00:01<00:00, 13.81trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 27/27 [00:01<00:00, 15.27trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 28/28 [00:02<00:00, 13.16trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.59trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 30/30 [00:01<00:00, 15.72trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 31/31 [00:01<00:00, 16.46trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 32/32 [00:01<00:00, 16.99trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 33/33 [00:15<00:00,  2.19trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 16.17trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 35/35 [00:01<00:00, 19.54trial/s, best loss: 15205.606957782684]
100%|████████████████████████████████████████████████| 36/36 [00:02<00:00, 12.80trial/s, best loss: 14905.861746001045]
100%|████████████████████████████████████████████████| 37/37 [00:01<00:00, 20.97trial/s, best loss: 14905.861746001045]
100%|████████████████████████████████████████████████| 38/38 [00:35<00:00,  1.06trial/s, best loss: 14905.861746001045]
100%|████████████████████████████████████████████████| 39/39 [00:02<00:00, 14.81trial/s, best loss: 14905.861746001045]
100%|████████████████████████████████████████████████| 40/40 [00:01<00:00, 21.67trial/s, best loss: 14905.861746001045]
100%|███████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.77s/trial, best loss: 16120.76718403548]
100%|████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.63s/trial, best loss: 15696.3021545819]
100%|████████████████████████████████████████████████████| 3/3 [00:01<00:00,  1.52trial/s, best loss: 15696.3021545819]
100%|████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.17trial/s, best loss: 15696.3021545819]
100%|████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.44trial/s, best loss: 15696.3021545819]
100%|████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.12trial/s, best loss: 15696.3021545819]
100%|██████████████████████████████████████████████████| 7/7 [00:04<00:00,  1.49trial/s, best loss: 15471.537184856066]
100%|██████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.44trial/s, best loss: 15471.537184856066]
100%|██████████████████████████████████████████████████| 9/9 [00:01<00:00,  5.19trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.91trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.44trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 12/12 [00:02<00:00,  5.71trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 13/13 [00:01<00:00,  6.81trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 14/14 [00:02<00:00,  5.64trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 15/15 [00:01<00:00,  8.58trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.63trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 17/17 [00:01<00:00,  9.74trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 18/18 [00:01<00:00,  9.35trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 19/19 [00:02<00:00,  7.44trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 20/20 [00:02<00:00,  9.42trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 21/21 [00:01<00:00, 11.50trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 22/22 [00:02<00:00,  8.84trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 23/23 [00:01<00:00, 12.06trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 24/24 [00:03<00:00,  6.39trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 25/25 [00:01<00:00, 13.97trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 26/26 [00:02<00:00, 10.98trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 27/27 [00:02<00:00, 11.94trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 28/28 [00:01<00:00, 15.11trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 29/29 [00:01<00:00, 16.44trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 30/30 [00:02<00:00, 13.60trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 31/31 [00:02<00:00, 12.03trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 32/32 [00:04<00:00,  6.49trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 33/33 [00:02<00:00, 14.41trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 16.41trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 35/35 [00:01<00:00, 18.58trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 36/36 [00:02<00:00, 15.15trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 37/37 [00:01<00:00, 20.97trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 38/38 [00:03<00:00, 10.14trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 39/39 [00:03<00:00, 11.29trial/s, best loss: 15471.537184856066]
100%|████████████████████████████████████████████████| 40/40 [00:03<00:00, 13.19trial/s, best loss: 15471.537184856066]
100%|██████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.78s/trial, best loss: 21624.126158532737]
100%|██████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.65s/trial, best loss: 20172.864335409668]
100%|██████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.37trial/s, best loss: 20172.864335409668]
100%|██████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.15trial/s, best loss: 20172.864335409668]
100%|██████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.30trial/s, best loss: 20172.864335409668]
100%|██████████████████████████████████████████████████| 6/6 [00:02<00:00,  2.99trial/s, best loss: 20172.864335409668]
100%|██████████████████████████████████████████████████| 7/7 [00:03<00:00,  2.14trial/s, best loss: 20172.864335409668]
100%|███████████████████████████████████████████████████| 8/8 [00:02<00:00,  2.89trial/s, best loss: 19724.89575571509]
100%|███████████████████████████████████████████████████| 9/9 [00:01<00:00,  5.13trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.16trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.54trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 12/12 [00:01<00:00,  6.16trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 13/13 [00:01<00:00,  6.51trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 14/14 [00:03<00:00,  4.65trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 15/15 [00:01<00:00,  7.99trial/s, best loss: 19724.89575571509]
100%|█████████████████████████████████████████████████| 16/16 [00:01<00:00,  8.53trial/s, best loss: 19724.89575571509]
100%|████████████████████████████████████████████████| 17/17 [00:01<00:00,  9.60trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 18/18 [00:02<00:00,  8.82trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 19/19 [00:02<00:00,  7.76trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 20/20 [00:02<00:00,  8.98trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 21/21 [00:01<00:00, 11.30trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 22/22 [00:02<00:00,  9.76trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 23/23 [00:02<00:00, 10.21trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 24/24 [00:02<00:00,  9.02trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 25/25 [00:01<00:00, 14.10trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 26/26 [00:02<00:00,  8.97trial/s, best loss: 19650.753242955016]
100%|████████████████████████████████████████████████| 27/27 [00:02<00:00, 12.80trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 28/28 [00:01<00:00, 14.89trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 29/29 [00:01<00:00, 15.23trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 30/30 [00:02<00:00, 12.38trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 31/31 [00:01<00:00, 15.67trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 32/32 [00:02<00:00, 12.54trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 33/33 [00:02<00:00, 14.86trial/s, best loss: 19523.572322926517]
100%|████████████████████████████████████████████████| 34/34 [00:02<00:00, 15.57trial/s, best loss: 19523.572322926517]
100%|█████████████████████████████████████████████████| 35/35 [00:02<00:00, 16.22trial/s, best loss: 19389.84483566683]
100%|█████████████████████████████████████████████████| 36/36 [00:02<00:00, 17.49trial/s, best loss: 19389.84483566683]
100%|█████████████████████████████████████████████████| 37/37 [00:01<00:00, 20.95trial/s, best loss: 19389.84483566683]
100%|█████████████████████████████████████████████████| 38/38 [00:03<00:00, 10.29trial/s, best loss: 19389.84483566683]
100%|█████████████████████████████████████████████████| 39/39 [00:02<00:00, 16.86trial/s, best loss: 19389.84483566683]
100%|█████████████████████████████████████████████████| 40/40 [00:03<00:00, 13.31trial/s, best loss: 19389.84483566683]
In [79]:
tr_df_pps
Out[79]:
mse rmse mae r2
xgBoost 369948129.41386 19234.03570 14006.22849 0.93853
gradBoost 452265579.49150 21266.53661 11906.25366 0.92486
extraTree 242604837.77286 15575.77728 8871.16226 0.95969
adaBoost 831122341.35628 28829.19252 21208.28422 0.86191
In [80]:
val_df_pps
Out[80]:
mse rmse mae r2
xgBoost 633076795.84543 25161.01738 17013.10227 0.90928
gradBoost 720057157.54021 26833.88078 15954.56687 0.89681
extraTree 793798034.63273 28174.42164 17248.01307 0.88624
adaBoost 1047735388.59857 32368.74092 22223.28176 0.84985
In [246]:
# Loading the best performing model -> xgBoost
In [81]:
with open('./Models/fullData/xgBoost.pkl', 'rb') as f:
    xgBoost = pickle.load(f)
In [82]:
xgBoost.best_model()
Out[82]:
{'learner': XGBRegressor(base_score=0.5, booster='gbtree',
              colsample_bylevel=0.7257864312451372, colsample_bynode=1,
              colsample_bytree=0.8388513846452539, gamma=0.0004593643410221875,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.07085553363851324, max_delta_step=0, max_depth=4,
              min_child_weight=10, missing=nan, monotone_constraints='()',
              n_estimators=3400, n_jobs=16, num_parallel_tree=1,
              objective='reg:linear', random_state=0,
              reg_alpha=0.025224287898974726, reg_lambda=2.420823984084491,
              scale_pos_weight=1, seed=0, subsample=0.9186941777766421,
              tree_method='exact', validate_parameters=1, verbosity=None),
 'preprocs': (),
 'ex_preprocs': ()}

Final Thoughts

In [83]:
n_feat = housePrices.shape[1]-1
n_feat_pps = len(pps_features)-1

print("Number of Independent Features in ORIGINAL dataset", n_feat)
print("Number of Independent Features selected by Predictive Power Score", n_feat_pps)
print("Percentage of features omitted using PPS: ", (n_feat-n_feat_pps)/n_feat * 100)
Number of Independent Features in ORIGINAL dataset 80
Number of Independent Features selected by Predictive Power Score 26
Percentage of features omitted using PPS:  67.5
  • In case of an regression, the ppscore uses the mean absolute error (MAE) as the underlying evaluation metric (MAE_model).

  • The best possible score of the MAE is 0 and higher is worse.

  • As a baseline score, we calculate the MAE of a naive model (MAE_naive) that always predicts the median of the target column.

  • The PPS is the result of the following normalization (and never smaller than 0):

    • PPS = 1 - (MAE_model / MAE_naive)
  • Medium article on PPS: Link
  • Metrics for the xgBoost model with all features on validation set :
      - MAE = 15884.68449    
      - R-squared = 0.90460
  • Metrics for the xgBoost model with features selected by PPS :

      - MAE = 17013.10227    
      - R-squared = 0.90928
  • The performance of the model is similar and we have elimated 67.5% of features using PPS.

Test data - Evaluation and Submission

In [84]:
test_all.head(3)
Out[84]:
MSSubClass LotFrontage LotArea LotShape OverallQual OverallCond MasVnrArea ExterQual ExterCond BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF HeatingQC CentralAir 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageFinish GarageCars GarageArea GarageQual GarageCond WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea Fence MiscVal MoSold YrSold Remodel HouseAge TotalSF Total_Bathrooms MSZoning_FV MSZoning_RH MSZoning_RL MSZoning_RM Alley_NotAvailable Alley_Pave LandContour_HLS LandContour_Low LandContour_Lvl LotConfig_CulDSac LotConfig_FR2 LotConfig_FR3 LotConfig_Inside LandSlope_Mod LandSlope_Sev Neighborhood_Blueste Neighborhood_BrDale Neighborhood_BrkSide Neighborhood_ClearCr Neighborhood_CollgCr Neighborhood_Crawfor Neighborhood_Edwards Neighborhood_Gilbert Neighborhood_IDOTRR Neighborhood_MeadowV Neighborhood_Mitchel Neighborhood_NAmes Neighborhood_NPkVill Neighborhood_NWAmes Neighborhood_NoRidge Neighborhood_NridgHt Neighborhood_OldTown Neighborhood_SWISU Neighborhood_Sawyer Neighborhood_SawyerW Neighborhood_Somerst Neighborhood_StoneBr Neighborhood_Timber Neighborhood_Veenker Condition1_Feedr Condition1_Norm Condition1_PosA Condition1_PosN Condition1_RRAe Condition1_RRAn Condition1_RRNe Condition1_RRNn Condition2_Feedr Condition2_Norm Condition2_PosA Condition2_PosN Condition2_RRAe Condition2_RRAn Condition2_RRNn BldgType_2fmCon BldgType_Duplex BldgType_Twnhs BldgType_TwnhsE HouseStyle_1.5Unf HouseStyle_1Story HouseStyle_2.5Fin HouseStyle_2.5Unf HouseStyle_2Story HouseStyle_SFoyer HouseStyle_SLvl RoofStyle_Gable RoofStyle_Gambrel RoofStyle_Hip RoofStyle_Mansard RoofStyle_Shed RoofMatl_CompShg RoofMatl_Membran RoofMatl_Metal RoofMatl_Roll RoofMatl_Tar&Grv RoofMatl_WdShake RoofMatl_WdShngl Exterior1st_AsphShn Exterior1st_BrkComm Exterior1st_BrkFace Exterior1st_CBlock Exterior1st_CemntBd Exterior1st_HdBoard Exterior1st_ImStucc Exterior1st_MetalSd Exterior1st_Plywood Exterior1st_Stone Exterior1st_Stucco Exterior1st_VinylSd Exterior1st_Wd Sdng Exterior1st_WdShing Exterior2nd_AsphShn Exterior2nd_Brk Cmn Exterior2nd_BrkFace Exterior2nd_CBlock Exterior2nd_CmentBd Exterior2nd_HdBoard Exterior2nd_ImStucc Exterior2nd_MetalSd Exterior2nd_Other Exterior2nd_Plywood Exterior2nd_Stone Exterior2nd_Stucco Exterior2nd_VinylSd Exterior2nd_Wd Sdng Exterior2nd_Wd Shng MasVnrType_BrkFace MasVnrType_None MasVnrType_Stone Foundation_CBlock Foundation_PConc Foundation_Slab Foundation_Stone Foundation_Wood Heating_GasA Heating_GasW Heating_Grav Heating_OthW Heating_Wall Electrical_FuseF Electrical_FuseP Electrical_Mix Electrical_SBrkr GarageType_Attchd GarageType_Basment GarageType_BuiltIn GarageType_CarPort GarageType_Detchd GarageType_NoGarage PavedDrive_P PavedDrive_Y MiscFeature_NotAvailable MiscFeature_Othr MiscFeature_Shed MiscFeature_TenC SaleType_CWD SaleType_Con SaleType_ConLD SaleType_ConLI SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 1 80.00000 11622 3 5 6 0.00000 3 3 3 3 1 3 468.00000 2 144.00000 270.00000 882.00000 3 1 0 0 896 0.00000 0.00000 0 2 1 3 5 7 0 0 1 1.00000 730.00000 3 3 140 0 0 0 120 0 3 0 6 2010 1 59 1778.00000 1.00000 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
1 1 81.00000 14267 2 6 6 108.00000 3 3 3 3 1 5 923.00000 1 0.00000 406.00000 1329.00000 3 1 0 0 1329 0.00000 0.00000 1 3 1 4 6 7 0 0 1 1.00000 312.00000 3 3 393 36 0 0 0 0 0 12500 6 2010 1 62 2658.00000 1.50000 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
2 6 74.00000 13830 2 5 5 0.00000 3 3 4 3 1 6 791.00000 1 0.00000 137.00000 928.00000 4 1 701 0 1629 0.00000 0.00000 1 3 1 3 6 7 1 3 3 2.00000 482.00000 3 3 212 34 0 0 0 0 3 0 3 2010 0 23 2557.00000 2.50000 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
In [85]:
def test_submissions(model, test_data, scaler, model_name, save=False):
    
    test_cols = test_data.columns
    test_std = scaler.transform(test_data)
    
    
    predictions = model.predict(test_std)

    d = {'Id': test_index, 'SalePrice': np.round(predictions,3)}
    submission = pd.DataFrame(data=d)
    
    if save:
        submission.to_csv(('submission_'+model_name+'.csv'),index=False)
    
    return submission
In [92]:
with open('./Models/fullData/xgBoost.pkl', 'rb') as f:
    xgBoost_all = pickle.load(f)
In [93]:
xgBoost_all.best_model()
Out[93]:
{'learner': XGBRegressor(base_score=0.5, booster='gbtree',
              colsample_bylevel=0.7257864312451372, colsample_bynode=1,
              colsample_bytree=0.8388513846452539, gamma=0.0004593643410221875,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.07085553363851324, max_delta_step=0, max_depth=4,
              min_child_weight=10, missing=nan, monotone_constraints='()',
              n_estimators=3400, n_jobs=16, num_parallel_tree=1,
              objective='reg:linear', random_state=0,
              reg_alpha=0.025224287898974726, reg_lambda=2.420823984084491,
              scale_pos_weight=1, seed=0, subsample=0.9186941777766421,
              tree_method='exact', validate_parameters=1, verbosity=None),
 'preprocs': (),
 'ex_preprocs': ()}
In [94]:
test_all_predictions = test_submissions(xgBoost_all, test_all, scaler_all, 'xgBoost_test', False)
In [87]:
sns.boxplot(test_all_predictions, showmeans=True)
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ba6059ce20>
In [95]:
test_all_predictions.to_csv('submission_all.csv', index=False)
api.competition_submit('submission_all.csv','All features','house-prices-advanced-regression-techniques')
100%|█████████████████████████████████████████████████████████████████████████████| 22.6k/22.6k [00:06<00:00, 3.79kB/s]
Out[95]:
Successfully submitted to House Prices - Advanced Regression Techniques
In [96]:
with open('./Models/pps/xgBoost.pkl', 'rb') as f:
    xgBoost_pps = pickle.load(f)
In [97]:
xgBoost_pps.best_model()
Out[97]:
{'learner': XGBRegressor(base_score=0.5, booster='gbtree',
              colsample_bylevel=0.7114781953033733, colsample_bynode=1,
              colsample_bytree=0.57747859483182, gamma=0.02642874339466163,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.008100084805258556, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=2800, n_jobs=16, num_parallel_tree=1,
              objective='reg:linear', random_state=0,
              reg_alpha=0.07468855970394649, reg_lambda=1.0578405336903982,
              scale_pos_weight=1, seed=0, subsample=0.8946561542322565,
              tree_method='exact', validate_parameters=1, verbosity=None),
 'preprocs': (),
 'ex_preprocs': ()}
In [98]:
test_pps_predictions = test_submissions(xgBoost_pps, test_pps, scaler_pps, 'xgBoost_pps', False)
In [99]:
sns.boxplot(test_pps_predictions, showmeans=True)
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ba6ce370d0>
In [100]:
test_pps_predictions.to_csv('submission_pps.csv', index=False)
api.competition_submit('submission_pps.csv','PPS features','house-prices-advanced-regression-techniques')
100%|█████████████████████████████████████████████████████████████████████████████| 22.6k/22.6k [00:05<00:00, 3.94kB/s]
Out[100]:
Successfully submitted to House Prices - Advanced Regression Techniques